R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

## Rows: 100000 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): gender, smoking_history
## dbl (7): age, hypertension, heart_disease, bmi, HbA1c_level, blood_glucose_l...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Including Plots

## # A tibble: 100,000 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Female    54            0             0 No Info          27.3         6.6
##  3 Male      28            0             0 never            27.3         5.7
##  4 Female    36            0             0 current          23.4         5  
##  5 Male      76            1             1 current          20.1         4.8
##  6 Female    20            0             0 never            27.3         6.6
##  7 Female    44            0             0 never            19.3         6.5
##  8 Female    79            0             0 No Info          23.9         5.7
##  9 Male      42            0             0 never            33.6         4.8
## 10 Female    32            0             0 never            27.3         5  
## # ℹ 99,990 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 58,552 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Female    54            0             0 No Info          27.3         6.6
##  3 Female    36            0             0 current          23.4         5  
##  4 Female    20            0             0 never            27.3         6.6
##  5 Female    44            0             0 never            19.3         6.5
##  6 Female    79            0             0 No Info          23.9         5.7
##  7 Female    32            0             0 never            27.3         5  
##  8 Female    53            0             0 never            27.3         6.1
##  9 Female    54            0             0 former           54.7         6  
## 10 Female    78            0             0 former           36.0         5  
## # ℹ 58,542 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 1 × 1
##       n
##   <int>
## 1 27397
## # A tibble: 1 × 1
##       n
##   <int>
## 1 18865
## # A tibble: 1,267 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Male      67            0             1 not current      27.3         6.5
##  2 Male      57            1             1 not current      27.8         6.6
##  3 Male      80            0             1 former           24.4         7.5
##  4 Male      75            0             1 not current      28.1         7.5
##  5 Male      69            0             1 former           24.1         6.8
##  6 Female    59            0             1 never            60.3         8.8
##  7 Male      80            0             1 former           33.0         6  
##  8 Female    62            1             1 former           44.2         8.2
##  9 Female    62            1             1 never            43.2         8.8
## 10 Female    76            0             1 former           25.7         9  
## # ℹ 1,257 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 3,942 × 10
## # Groups:   bmi >= 30 [2]
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Male      76            1             1 current          20.1         4.8
##  3 Female    72            0             1 former           27.9         6.5
##  4 Male      67            0             1 not current      27.3         6.5
##  5 Female    77            1             1 never            32.0         5  
##  6 Female    59            0             1 ever             23.1         6.5
##  7 Male      68            1             1 current          27.3         5  
##  8 Male      59            0             1 ever             30.8         5  
##  9 Female    80            0             1 never            29.6         5.8
## 10 Male      57            1             1 not current      27.8         6.6
## # ℹ 3,932 more rows
## # ℹ 3 more variables: blood_glucose_level <dbl>, diabetes <dbl>,
## #   `bmi >= 30` <lgl>
## # A tibble: 3,942 × 10
## # Groups:   bmi >= 30 [2]
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Male      76            1             1 current          20.1         4.8
##  3 Female    72            0             1 former           27.9         6.5
##  4 Male      67            0             1 not current      27.3         6.5
##  5 Female    77            1             1 never            32.0         5  
##  6 Female    59            0             1 ever             23.1         6.5
##  7 Male      68            1             1 current          27.3         5  
##  8 Male      59            0             1 ever             30.8         5  
##  9 Female    80            0             1 never            29.6         5.8
## 10 Male      57            1             1 not current      27.8         6.6
## # ℹ 3,932 more rows
## # ℹ 3 more variables: blood_glucose_level <dbl>, diabetes <dbl>,
## #   `bmi >= 30` <lgl>
## # A tibble: 1,903 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Male      50            0             0 former           37.2         9  
##  2 Male      53            0             0 current          30.8         6.6
##  3 Male      76            0             0 never            31.9         7.5
##  4 Male      63            1             0 ever             35.1         5.8
##  5 Male      48            1             0 current          36.1         6.8
##  6 Male      37            0             0 never            37.2         7  
##  7 Male      36            0             0 not current      46.1         6.2
##  8 Male      50            0             0 never            31.8         7.5
##  9 Male      43            0             0 never            69.4         7.5
## 10 Male      43            1             0 not current      40.9         6.6
## # ℹ 1,893 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 1 × 1
##       n
##   <int>
## 1  1903
## # A tibble: 2,330 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    67            0             0 never            63.5         8.8
##  2 Female    36            0             0 current          32.3         6.2
##  3 Female    77            0             0 never            31.7         6.5
##  4 Female    47            0             0 never            36.5         7.5
##  5 Female    61            0             0 not current      39.4         9  
##  6 Female    80            0             0 former           36.2         6.5
##  7 Female    52            1             0 never            50.3         6.6
##  8 Female    68            0             0 No Info          40.3         7.5
##  9 Female    70            0             0 not current      33.2         7.5
## 10 Female    67            0             0 former           32.3         7  
## # ℹ 2,320 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 1 × 1
##       n
##   <int>
## 1  2330
## # A tibble: 21 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Male      42            0             0 current          11.9         6  
##  2 Male       6            0             0 never            15.7         6.1
##  3 Male      71            1             0 former           13.2         6.6
##  4 Male      14            0             0 never            19.0         6.6
##  5 Male      54            0             0 never            18.9         6  
##  6 Male      61            1             0 never            18.4         6.5
##  7 Male       4            0             0 never            18.7         6  
##  8 Male      51            0             0 current          17.8         6.2
##  9 Male      80            1             0 current          19.0         6.6
## 10 Male       6            0             0 No Info          15.6         9  
## # ℹ 11 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 1 × 1
##       n
##   <int>
## 1    21
## # A tibble: 57 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    79            0             0 not current      18.1         7  
##  2 Female     4            0             0 No Info          15.0         6.5
##  3 Female    51            0             0 current          17.4         7  
##  4 Female     9            0             0 never            16           6.1
##  5 Female    60            0             0 No Info          17.9         8.2
##  6 Female    13            0             0 No Info          17.3         6.2
##  7 Female    80            0             0 never            17.4         6.5
##  8 Female     8            0             0 No Info          14.3         7.5
##  9 Female    80            0             0 never            17.8         6.2
## 10 Female    78            1             0 not current      17.7         8.8
## # ℹ 47 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 1 × 1
##       n
##   <int>
## 1    57
## # A tibble: 7,445 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Male      42            0             0 never            33.6         4.8
##  2 Male      15            0             0 never            30.4         6.1
##  3 Male      40            0             0 current          36.4         6  
##  4 Male      30            0             0 never            33.8         6.1
##  5 Male      34            0             0 never            31.2         5.8
##  6 Male      54            0             0 never            31.9         6.6
##  7 Male      79            0             0 former           31.2         5.8
##  8 Male      54            0             0 former           32.8         5  
##  9 Male      38            0             0 never            55.6         6.5
## 10 Male      58            0             0 former           36.5         5.8
## # ℹ 7,435 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 1 × 1
##       n
##   <int>
## 1  7445
## # A tibble: 11,852 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    54            0             0 former           54.7         6  
##  2 Female    78            0             0 former           36.0         5  
##  3 Female    53            0             0 No Info          31.8         4  
##  4 Female    34            0             0 never            56.4         6.2
##  5 Female    77            1             1 never            32.0         5  
##  6 Female    27            0             0 not current      30.2         5.7
##  7 Female    37            0             0 No Info          30.5         5.7
##  8 Female    56            0             0 never            31.0         6.5
##  9 Female    44            0             0 never            37.4         5.7
## 10 Female    30            0             0 No Info          50.1         6  
## # ℹ 11,842 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 1 × 1
##       n
##   <int>
## 1 11852

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

In this section we want to display a stacked bar chart showing how HbA1c levels (average blood sugar levels) classify males and females into categories such as Normal, Prediabetes, and Diabetes. From this plot we gain insight into how the distribution of Normal, Prediabetes, and Diabetes varies between the two genders.

First we load our libraries: dplyr for data manipulation, ggplot2 for data visualization, and plotly to make our plot interactive. Then we display our original dataset.

From our original dataset we create a new dataset called HbA1c_by_gender. For our new dataset we want to keep only the Male and Female genders and exclude Other; we do this by filtering for rows where the gender column is not equal to ‘Other’. We also create a new variable called HbA1c_category using the mutate function. We use the case_when function to classify our HbA1c_level column into categories such as ‘Normal’, ‘Prediabetes’, and ‘Diabetes’.

Then we print out our mutated dataset to compare it with the original.

In the next section we want our stacked bars to follow a certain order: Normal at the top, Prediabetes in the middle, and Diabetes at the bottom. To do this we will mutate our HbA1c_category column into an ordered categorical variable using the factor function; its levels argument sets the order we want.

Next, we will plot a stacked bar chart using ggplot. Before that, we will change the name of our dataset to ‘Interactive_mode’ so that we can easily pass it to ggplotly. The scale_fill_manual function is used to manually assign colors to the different categories.

Lastly, with the use of plotly we transform our plot into an interactive plot. When hovering over the plot we are able to see the count, gender, and HbA1c_category for any of the stacked bars.

## Warning: package 'plotly' was built under R version 4.4.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## # A tibble: 100,000 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Female    54            0             0 No Info          27.3         6.6
##  3 Male      28            0             0 never            27.3         5.7
##  4 Female    36            0             0 current          23.4         5  
##  5 Male      76            1             1 current          20.1         4.8
##  6 Female    20            0             0 never            27.3         6.6
##  7 Female    44            0             0 never            19.3         6.5
##  8 Female    79            0             0 No Info          23.9         5.7
##  9 Male      42            0             0 never            33.6         4.8
## 10 Female    32            0             0 never            27.3         5  
## # ℹ 99,990 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 99,982 × 10
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Female    54            0             0 No Info          27.3         6.6
##  3 Male      28            0             0 never            27.3         5.7
##  4 Female    36            0             0 current          23.4         5  
##  5 Male      76            1             1 current          20.1         4.8
##  6 Female    20            0             0 never            27.3         6.6
##  7 Female    44            0             0 never            19.3         6.5
##  8 Female    79            0             0 No Info          23.9         5.7
##  9 Male      42            0             0 never            33.6         4.8
## 10 Female    32            0             0 never            27.3         5  
## # ℹ 99,972 more rows
## # ℹ 3 more variables: blood_glucose_level <dbl>, diabetes <dbl>,
## #   HbA1c_category <chr>
library(ggplot2)

# Synthetic stand-in data (replace with the actual data): 50 rows per gender,
# BMI drawn uniformly between 18 and 40, diabetes status sampled at random.
data <- data.frame(
  gender = factor(rep(c("Male", "Female", "Other"), each = 50)),
  # A single draw of 150 consumes the RNG stream exactly like three draws of 50.
  bmi = runif(150, min = 18, max = 40),
  diabetes = factor(sample(c("Yes", "No"), size = 150, replace = TRUE))
)

# Boxplots of BMI per gender, split by diabetes status via the fill colour.
ggplot(data, aes(x = gender, y = bmi, fill = diabetes)) +
  geom_boxplot() +
  ggtitle("BMI Distribution by Gender and Diabetes Status") +
  xlab("Gender") +
  ylab("BMI") +
  theme_minimal()

# Extra information about the gender column of the original data set:
# the number of rows for each gender value.
diabetes_dataset %>%
  filter(gender == "Female") %>%
  tally() # 58,552: we have 17,122 more females than males in this data set
## # A tibble: 1 × 1
##       n
##   <int>
## 1 58552
diabetes_dataset %>%
  filter(gender == "Male") %>%
  tally() # 41,430
## # A tibble: 1 × 1
##       n
##   <int>
## 1 41430
diabetes_dataset %>%
  filter(gender == "Other") %>%
  tally() # 18
## # A tibble: 1 × 1
##       n
##   <int>
## 1    18
# TODO: check why most people don't have 5.3, given how many people we are working with ****
# TODO: turn heart disease into a category
# NOTE: when we graph, this data may not be legitimate because people older than 80 seem to be pushed out



``` r
library(dplyr)
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# Overlaid age densities for patients with diabetes and patients with heart
# disease. A patient who has both conditions contributes to both curves.
diabetes_dataset %>%
  filter(age >= 3) %>% # keep ages 3 and up; the x axis below spans 3-80
  select(age, diabetes, heart_disease) %>%
  # Reshape to one row per (patient, condition). The function is namespaced
  # because this chunk attaches dplyr, ggplot2 and scales but not tidyr.
  tidyr::pivot_longer(
    cols = c(diabetes, heart_disease),
    names_to = "condition",
    values_to = "status"
  ) %>%
  filter(status == 1) %>% # keep only rows where the condition is present
  ggplot(aes(x = age, fill = condition)) +
  geom_density(alpha = 0.6, adjust = 1.5) +
  scale_x_continuous(limits = c(3, 80), breaks = seq(3, 80, by = 7)) +
  # Use the default scale (x100): with scale = 1 a density of 0.02 was
  # labelled "0.02%" instead of "2%".
  scale_y_continuous(labels = scales::percent_format()) +
  # Named vectors tie each colour and label to its condition explicitly
  # instead of relying on the alphabetical order of the values.
  scale_fill_manual(
    values = c(diabetes = "red", heart_disease = "purple"),
    labels = c(diabetes = "Diabetes", heart_disease = "Heart Disease")
  ) +
  labs(
    title = "Density of Diabetes & Heart Disease Across Age Groups",
    x = "Age",
    y = "Percentage Density",
    fill = "Condition"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Auto-print the data set; the knitted tibble preview follows below.
diabetes_dataset
## # A tibble: 100,000 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Female    54            0             0 No Info          27.3         6.6
##  3 Male      28            0             0 never            27.3         5.7
##  4 Female    36            0             0 current          23.4         5  
##  5 Male      76            1             1 current          20.1         4.8
##  6 Female    20            0             0 never            27.3         6.6
##  7 Female    44            0             0 never            19.3         6.5
##  8 Female    79            0             0 No Info          23.9         5.7
##  9 Male      42            0             0 never            33.6         4.8
## 10 Female    32            0             0 never            27.3         5  
## # ℹ 99,990 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Age density of patients aged 3 and up who have both diabetes and
# heart disease.
diabetes_dataset %>%
  filter(age >= 3, diabetes == 1, heart_disease == 1) %>%
  select(age, diabetes, heart_disease) %>%
  arrange(age) %>%
  ggplot(mapping = aes(x = age)) +
  theme_minimal() +
  geom_density(adjust = 1.5, alpha = 0.6)

# Auto-print the data set again; the knitted tibble preview follows below.
diabetes_dataset
## # A tibble: 100,000 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Female    54            0             0 No Info          27.3         6.6
##  3 Male      28            0             0 never            27.3         5.7
##  4 Female    36            0             0 current          23.4         5  
##  5 Male      76            1             1 current          20.1         4.8
##  6 Female    20            0             0 never            27.3         6.6
##  7 Female    44            0             0 never            19.3         6.5
##  8 Female    79            0             0 No Info          23.9         5.7
##  9 Male      42            0             0 never            33.6         4.8
## 10 Female    32            0             0 never            27.3         5  
## # ℹ 99,990 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Subsets of patients aged 2 and up: diabetes cases, heart-disease cases,
# and everyone (age and BMI columns only).
diabetes_only <- diabetes_dataset %>%
  select(age, diabetes) %>%
  filter(age >= 2, diabetes == 1)
heart_disease_only <- diabetes_dataset %>%
  select(age, heart_disease) %>%
  filter(age >= 2, heart_disease == 1)
bmi_older_than_two <- diabetes_dataset %>%
  select(age, bmi) %>%
  filter(age >= 2)

# Overlaid age densities. The fill is mapped inside aes() so that ggplot2
# draws a legend; with constant fills the three layers were unlabelled and
# could not be told apart. Colours are the same as before.
ggplot() +
  geom_density(
    data = diabetes_only,
    aes(x = age, fill = "Diabetes"),
    alpha = 0.5
  ) +
  geom_density(
    data = heart_disease_only,
    aes(x = age, fill = "Heart disease"),
    alpha = 0.5
  ) +
  geom_density(
    data = bmi_older_than_two,
    aes(x = age, fill = "All patients (age >= 2)"),
    alpha = 0.5
  ) +
  scale_fill_manual(
    values = c(
      "Diabetes" = "blue",
      "Heart disease" = "red",
      "All patients (age >= 2)" = "magenta"
    ),
    breaks = c("Diabetes", "Heart disease", "All patients (age >= 2)")
  ) +
  labs(
    title = "Age Distribution: Diabetes vs. Heart Disease",
    x = "Age",
    y = "Density",
    fill = "Group"
  ) +
  theme_minimal()

3666 #10.01 smallest, largest

# Rows for patients aged 2 and up whose BMI is missing, youngest first.
# `bmi` is a numeric column, so missing values are NA and must be detected
# with is.na(); comparing against the string 'NA' never matches (and the
# typographic quotes previously used here do not parse as R).
bmi_older_than_two <- diabetes_dataset %>%
  select(age, bmi) %>%
  filter(age >= 2, is.na(bmi)) %>%
  arrange(age)

# Diabetes cases, oldest first (youngest is 3 years old, oldest is 80).
diabetes_dataset %>%
  filter(diabetes == 1) %>%
  select(age, diabetes) %>%
  arrange(desc(age))

# Heart-disease cases, oldest first (youngest is 2, oldest is 80).
diabetes_dataset %>%
  filter(heart_disease == 1) %>%
  select(age, heart_disease) %>%
  arrange(desc(age))

min age 6 (only 1 person)

max 80

1,267 X 3

library(ggplot2)
library(dplyr)

# Create a new condition column.
# case_when() takes the first matching branch, so a patient with both
# conditions is labelled "Diabetes Only"; patients with neither condition get
# NA and are dropped by the filter below.
# NOTE(review): confirm that labelling patients with both conditions as
# "Diabetes Only" is intended.
diabetes_dataset_condition <- diabetes_dataset %>%
  mutate(condition = case_when(
    diabetes == 1 ~ "Diabetes Only",
    heart_disease == 1 ~ "Heart Disease Only"
  )) %>%
  filter(!is.na(condition))

# Scatter plot for Age vs. BMI, color-coded by condition
ggplot(diabetes_dataset_condition) +
  geom_point(aes(x = age, y = bmi, color = condition), alpha = 0.2) +
  labs(
    title = "BMI vs. Age Across Diabetes & Heart Disease",
    x = "Age",
    y = "BMI",
    color = "Condition"
  ) +
  theme_minimal()

library(ggplot2)
library(dplyr)
# summary(diabetes_dataset$bmi)
# diabetes_dataset$bmi <- as.numeric(diabetes_dataset$bmi)

# Copy of the data with a bmi_filtered column in which BMI values below 2
# are replaced by NA.
diabetes_dataset_filtered_bmi <- diabetes_dataset %>%
  mutate(bmi_filtered = ifelse(bmi >= 2, bmi, NA))

# Patients aged 2 and up with diabetes / with heart disease. The two filters
# are independent, so a patient with both conditions lands in both subsets.
diabetes_only <- diabetes_dataset_filtered_bmi %>%
  select(age, bmi_filtered, diabetes) %>%
  filter(age >= 2, diabetes == 1) %>%
  mutate(condition = "Diabetes Only")

heart_disease_only <- diabetes_dataset_filtered_bmi %>%
  select(age, bmi_filtered, heart_disease) %>%
  filter(age >= 2, heart_disease == 1) %>%
  mutate(condition = "Heart Disease Only")

combined_data <- bind_rows(diabetes_only, heart_disease_only)

# BMI vs. age, colored by condition, with one loess trend line fitted to the
# combined data.
# NOTE(review): geom_point and geom_jitter both draw every row, so each
# observation is plotted twice; confirm that this is intended.
ggplot(combined_data) +
  geom_point(
    aes(x = age, y = bmi_filtered, color = condition),
    alpha = 0.2
  ) + # Scatter plot with BMI
  geom_jitter(
    aes(x = age, y = bmi_filtered, color = condition),
    width = 0.1,
    height = 0.1,
    alpha = 0.3
  ) +
  scale_color_manual(
    values = c(
      "Diabetes Only" = "deeppink",
      "Heart Disease Only" = "darkblue"
    )
  ) +
  geom_smooth(
    aes(x = age, y = bmi_filtered),
    method = "loess",
    size = 0.8,
    color = "red",
    se = FALSE
  ) + # Single trend line
  scale_x_continuous(breaks = seq(0, 80, by = 10)) +
  scale_y_continuous(breaks = seq(10.01, 95.69, by = 10)) +
  labs(
    title = "BMI vs. Age Across Diabetes & Heart Disease",
    x = "Age",
    y = "BMI",
    color = "Condition"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

library(ggplot2)
library(dplyr)

# Copy of the data with a bmi_filtered column in which BMI values below 2
# are replaced by NA.
diabetes_dataset_filtered_bmi <- diabetes_dataset %>%
  mutate(bmi_filtered = ifelse(bmi >= 2, bmi, NA))

# Patients aged 2 and up with diabetes / with heart disease. The two filters
# are independent, so a patient with both conditions lands in both subsets.
diabetes_only <- diabetes_dataset_filtered_bmi %>%
  select(age, bmi_filtered, diabetes) %>%
  filter(age >= 2, diabetes == 1) %>%
  mutate(condition = "Diabetes Only")

heart_disease_only <- diabetes_dataset_filtered_bmi %>%
  select(age, bmi_filtered, heart_disease) %>%
  filter(age >= 2, heart_disease == 1) %>%
  mutate(condition = "Heart Disease Only")

combined_data <- bind_rows(diabetes_only, heart_disease_only)

# BMI vs. age, colored by condition, with one loess trend line fitted to the
# combined data.
# NOTE(review): geom_point and geom_jitter both draw every row, so each
# observation is plotted twice; confirm that this is intended.
ggplot(combined_data) +
  geom_point(
    aes(x = age, y = bmi_filtered, color = condition),
    alpha = 0.3
  ) + # Scatter plot with BMI
  geom_jitter(
    aes(x = age, y = bmi_filtered, color = condition),
    width = 0.1,
    height = 0.1,
    alpha = 0.3
  ) +
  scale_color_manual(
    values = c(
      "Diabetes Only" = "cornflowerblue",
      "Heart Disease Only" = "darkorchid4"
    )
  ) +
  geom_smooth(
    aes(x = age, y = bmi_filtered),
    method = "loess",
    size = 1,
    color = "red",
    se = FALSE
  ) + # Single trend line
  scale_x_continuous(breaks = seq(0, 80, by = 10)) +
  scale_y_continuous(breaks = seq(10.01, 95.69, by = 10)) +
  labs(
    title = "BMI vs. Age Across Diabetes & Heart Disease",
    x = "Age",
    y = "BMI",
    color = "Condition"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
# ggplotly(interactive_combined_data)
####

# Rows for patients aged 2 and up whose BMI is missing. `bmi` is numeric, so
# missing values are NA and are detected with is.na(); comparing against the
# string 'N/A' never matches.
bmi_older_than_two <- diabetes_dataset %>%
  select(age, bmi) %>%
  filter(age >= 2, is.na(bmi))
# min bmi 10.01
# max bmi 95.69

# Copy of the data with BMI values below 2 replaced by NA, printed with the
# largest BMI first.
diabetes_dataset_filtered_bmi <- diabetes_dataset %>%
  mutate(bmi_filtered = ifelse(bmi >= 2, bmi, NA))
diabetes_dataset_filtered_bmi %>%
  arrange(-bmi_filtered)



```{=html}
<div class="plotly html-widget html-fill-item" id="htmlwidget-a6376b90d3fb62343749" style="width:672px;height:576px;"></div>
<script type="application/json" data-for="htmlwidget-a6376b90d3fb62343749">{"x":{"data":[{"orientation":"v","width":[0.89999999999999991,0.90000000000000013],"base":[0.6158628227899986,0.62930243784697082],"x":[1,2],"y":[0.3841371772100014,0.37069756215302918],"text":["gender: Female<br />n: 0.3841372<br />HbA1c_category: Normal < 5.7%","gender: Male<br />n: 0.3706976<br />HbA1c_category: Normal < 5.7%"],"type":"bar","textposition":"none","marker":{"autocolorscale":false,"color":"rgba(255,248,220,0.6)","line":{"width":0.37795275590551186,"color":"rgba(0,0,0,1)"}},"name":"Normal < 5.7%","legendgroup":"Normal < 5.7%","showlegend":true,"xaxis":"x","yaxis":"y","hoverinfo":"text","frame":null},{"orientation":"v","width":[0.89999999999999991,0.90000000000000013],"base":[0.20212802295395546,0.21624426743905384],"x":[1,2],"y":[0.41373479983604311,0.41305817040791698],"text":["gender: Female<br />n: 0.4137348<br />HbA1c_category: Prediabetes 5.7% - 6.4%","gender: Male<br />n: 0.4130582<br />HbA1c_category: Prediabetes 5.7% - 6.4%"],"type":"bar","textposition":"none","marker":{"autocolorscale":false,"color":"rgba(189,183,107,0.6)","line":{"width":0.37795275590551186,"color":"rgba(0,0,0,1)"}},"name":"Prediabetes 5.7% - 6.4%","legendgroup":"Prediabetes 5.7% - 6.4%","showlegend":true,"xaxis":"x","yaxis":"y","hoverinfo":"text","frame":null},{"orientation":"v","width":[0.89999999999999991,0.90000000000000013],"base":[0,0],"x":[1,2],"y":[0.20212802295395546,0.21624426743905384],"text":["gender: Female<br />n: 0.2021280<br />HbA1c_category: Diabetes ≥ 6.5%","gender: Male<br />n: 0.2162443<br />HbA1c_category: Diabetes ≥ 6.5%"],"type":"bar","textposition":"none","marker":{"autocolorscale":false,"color":"rgba(139,101,8,0.6)","line":{"width":0.37795275590551186,"color":"rgba(0,0,0,1)"}},"name":"Diabetes ≥ 6.5%","legendgroup":"Diabetes ≥ 
6.5%","showlegend":true,"xaxis":"x","yaxis":"y","hoverinfo":"text","frame":null},{"x":[1,2],"y":[0.80793141139499935,0.81465121892348535],"text":["38.4%","37.1%"],"hovertext":["gender: Female<br />n: 22492<br />HbA1c_category: Normal < 5.7%<br />paste0(round(percent, 1), \"%\"): 38.4%","gender: Male<br />n: 15358<br />HbA1c_category: Normal < 5.7%<br />paste0(round(percent, 1), \"%\"): 37.1%"],"textfont":{"size":14.66456692913386,"color":"rgba(0,0,0,1)"},"type":"scatter","mode":"text","hoveron":"points","name":"Normal < 5.7%","legendgroup":"Normal < 5.7%","showlegend":false,"xaxis":"x","yaxis":"y","hoverinfo":"text","frame":null},{"x":[1,2],"y":[0.40899542287197704,0.4227733526430123],"text":["41.4%","41.3%"],"hovertext":["gender: Female<br />n: 24225<br />HbA1c_category: Prediabetes 5.7% - 6.4%<br />paste0(round(percent, 1), \"%\"): 41.4%","gender: Male<br />n: 17113<br />HbA1c_category: Prediabetes 5.7% - 6.4%<br />paste0(round(percent, 1), \"%\"): 41.3%"],"textfont":{"size":14.66456692913386,"color":"rgba(0,0,0,1)"},"type":"scatter","mode":"text","hoveron":"points","name":"Prediabetes 5.7% - 6.4%","legendgroup":"Prediabetes 5.7% - 6.4%","showlegend":false,"xaxis":"x","yaxis":"y","hoverinfo":"text","frame":null},{"x":[1,2],"y":[0.10106401147697773,0.10812213371952692],"text":["20.2%","21.6%"],"hovertext":["gender: Female<br />n: 11835<br />HbA1c_category: Diabetes ≥ 6.5%<br />paste0(round(percent, 1), \"%\"): 20.2%","gender: Male<br />n:  8959<br />HbA1c_category: Diabetes ≥ 6.5%<br />paste0(round(percent, 1), \"%\"): 21.6%"],"textfont":{"size":14.66456692913386,"color":"rgba(0,0,0,1)"},"type":"scatter","mode":"text","hoveron":"points","name":"Diabetes ≥ 6.5%","legendgroup":"Diabetes ≥ 
6.5%","showlegend":false,"xaxis":"x","yaxis":"y","hoverinfo":"text","frame":null}],"layout":{"margin":{"t":42.057838660578383,"r":7.3059360730593621,"b":38.477929984779308,"l":48.949771689497723},"plot_bgcolor":"rgba(255,255,255,1)","paper_bgcolor":"rgba(255,255,255,1)","font":{"color":"rgba(0,0,0,1)","family":"","size":14.611872146118724},"title":{"text":"Male vs. Female Blood Sugar Levels (HbA1c)","font":{"color":"rgba(0,0,0,1)","family":"","size":17.534246575342465},"x":0.5,"xref":"paper"},"xaxis":{"domain":[0,1],"automargin":true,"type":"linear","autorange":false,"range":[0.40000000000000002,2.6000000000000001],"tickmode":"array","ticktext":["Female","Male"],"tickvals":[1,2],"categoryorder":"array","categoryarray":["Female","Male"],"nticks":null,"ticks":"outside","tickcolor":"rgba(51,51,51,1)","ticklen":3.6529680365296811,"tickwidth":0.66417600664176002,"showticklabels":true,"tickfont":{"color":"rgba(77,77,77,1)","family":"","size":11.68949771689498},"tickangle":-0,"showline":true,"linecolor":"rgba(0,0,0,1)","linewidth":0.66417600664176002,"showgrid":false,"gridcolor":null,"gridwidth":0,"zeroline":false,"anchor":"y","title":{"text":"Gender","font":{"color":"rgba(0,0,0,1)","family":"","size":14.611872146118724}},"hoverformat":".2f"},"yaxis":{"domain":[0,1],"automargin":true,"type":"linear","autorange":false,"range":[-0.050000000000000003,1.05],"tickmode":"array","ticktext":["0.00","0.25","0.50","0.75","1.00"],"tickvals":[0,0.25,0.5,0.75,1],"categoryorder":"array","categoryarray":["0.00","0.25","0.50","0.75","1.00"],"nticks":null,"ticks":"outside","tickcolor":"rgba(51,51,51,1)","ticklen":3.6529680365296811,"tickwidth":0.66417600664176002,"showticklabels":true,"tickfont":{"color":"rgba(77,77,77,1)","family":"","size":11.68949771689498},"tickangle":-0,"showline":true,"linecolor":"rgba(0,0,0,1)","linewidth":0.66417600664176002,"showgrid":false,"gridcolor":null,"gridwidth":0,"zeroline":false,"anchor":"x","title":{"text":"Proportion","font":{"color":"rgba(0,0,0,1)","fa
mily":"","size":14.611872146118724}},"hoverformat":".2f"},"shapes":[{"type":"rect","fillcolor":null,"line":{"color":null,"width":0,"linetype":[]},"yref":"paper","xref":"paper","x0":0,"x1":1,"y0":0,"y1":1}],"showlegend":true,"legend":{"bgcolor":"rgba(255,255,255,1)","bordercolor":"transparent","borderwidth":1.8897637795275593,"font":{"color":"rgba(0,0,0,1)","family":"","size":11.68949771689498},"title":{"text":"HbA1c Category","font":{"color":"rgba(0,0,0,1)","family":"","size":14.611872146118724}}},"hovermode":"closest","barmode":"relative"},"config":{"doubleClick":"reset","modeBarButtonsToAdd":["hoverclosest","hovercompare"],"showSendToCloud":false},"source":"A","attrs":{"5ff0750a5ba2":{"x":{},"y":{},"fill":{},"type":"bar"},"5ff02bf4b6c":{"x":{},"y":{},"fill":{},"label":{}}},"cur_data":"5ff0750a5ba2","visdat":{"5ff0750a5ba2":["function (y) ","x"],"5ff02bf4b6c":["function (y) ","x"]},"highlight":{"on":"plotly_click","persistent":false,"dynamic":false,"selectize":false,"opacityDim":0.20000000000000001,"selected":{"opacity":1},"debounce":0},"shinyEvents":["plotly_hover","plotly_click","plotly_selected","plotly_relayout","plotly_brushed","plotly_brushing","plotly_clickannotation","plotly_doubleclick","plotly_deselect","plotly_afterplot","plotly_sunburstclick"],"base_url":"https://plot.ly"},"evals":[],"jsHooks":[]}</script>

TODO: make a plot of this.

diabetes_dataset

# Build the comparison groups, each restricted to ages 2 and over.
# One statement per line: the three assignments cannot share a line in R.
diabetes_only <- diabetes_dataset %>%
  select(age, diabetes) %>%
  filter(age >= 2, diabetes == 1)
heart_disease_only <- diabetes_dataset %>%
  select(age, heart_disease) %>%
  filter(age >= 2, heart_disease == 1)
bmi_older_than_two <- diabetes_dataset %>%
  select(age, bmi) %>%
  filter(age >= 2)

# Overlay the age densities of the three groups. Every layer sits on its own
# line so a trailing comment cannot swallow the layers that follow it, and
# strings use plain double quotes (typographic quotes are not valid R).
ggplot() +
  # Diabetes cases
  geom_density(
    data = diabetes_only, aes(x = age),
    fill = "blue", alpha = 0.5
  ) +
  # Heart disease only
  geom_density(
    data = heart_disease_only, aes(x = age),
    fill = "red", alpha = 0.5
  ) +
  # Every row aged 2 or over (the bmi_older_than_two subset)
  geom_density(
    data = bmi_older_than_two, aes(x = age),
    fill = "magenta", alpha = 0.5
  ) +
  labs(
    title = "Age Distribution: Diabetes vs. Heart Disease",
    x = "Age",
    y = "Density"
  ) +
  theme_minimal()

diabetes_dataset

# Diabetes cases. blood_glucose_level is kept because the density plot below
# maps it on the x axis.
# y: 3 o: 80 (presumably youngest / oldest age in this subset -- confirm)
diabetes_only <- diabetes_dataset %>%
  select(age, diabetes, blood_glucose_level) %>%
  filter(diabetes == 1)
dia_count <- diabetes_only %>% tally()
dia_count

# Rows aged 2 or over with a recorded blood glucose level. Missing values are
# tested with is.na() rather than by comparing against the string "NA".
# y: 2 o: 80
blood_glucose_dataset <- diabetes_dataset %>%
  select(age, blood_glucose_level) %>%
  filter(!is.na(blood_glucose_level), age >= 2)
bg_count <- blood_glucose_dataset %>% tally()
bg_count

# Heart disease cases; blood_glucose_level is kept for the plot below.
# y: 2 o: 80
heart_disease_only <- diabetes_dataset %>%
  select(age, heart_disease, blood_glucose_level) %>%
  filter(heart_disease == 1)
hd_count <- heart_disease_only %>% tally()
hd_count

# Rows that satisfy every condition at once.
all_in_one <- diabetes_dataset %>%
  select(age, diabetes, blood_glucose_level, heart_disease) %>%
  filter(
    diabetes == 1,
    !is.na(blood_glucose_level),
    age >= 2,
    heart_disease == 1
  )
all_in_one

# Overlay the blood glucose densities of the three subsets.
ggplot() +
  # Diabetes cases
  geom_density(
    data = diabetes_only, aes(x = blood_glucose_level),
    fill = "blue", alpha = 0.5
  ) +
  # Blood glucose levels
  geom_density(
    data = blood_glucose_dataset, aes(x = blood_glucose_level),
    fill = "red", alpha = 0.5
  ) +
  # Heart disease cases
  geom_density(
    data = heart_disease_only, aes(x = blood_glucose_level),
    fill = "magenta", alpha = 0.5
  ) +
  labs(
    title = "Blood Glucose Distribution: Diabetes vs. Heart Disease",
    x = "Blood Glucose Level",
    y = "Density"
  ) +
  theme_minimal()

Load necessary libraries

Load necessary libraries

library(ggplot2) library(dplyr) library(DT)

Filter datasets properly

# Diabetes cases that have a recorded blood glucose level.
diabetes_only <- diabetes_dataset %>%
  filter(diabetes == 1, !is.na(blood_glucose_level)) %>%
  select(age, diabetes, blood_glucose_level)

# Rows aged 2 or over that have a recorded blood glucose level.
blood_glucose_dataset <- diabetes_dataset %>%
  filter(!is.na(blood_glucose_level), age >= 2) %>%
  select(age, blood_glucose_level)

# Heart disease cases that have a recorded blood glucose level.
heart_disease_only <- diabetes_dataset %>%
  filter(heart_disease == 1, !is.na(blood_glucose_level)) %>%
  select(age, heart_disease, blood_glucose_level)

Create density plot

# Overlay the blood glucose densities of the three prepared subsets.
# NOTE(review): the first layer previously drew the whole diabetes_dataset with
# fill = after_stat(density), leaving diabetes_only unused; it now draws
# diabetes_only with a constant fill to match the plot title -- confirm intent.
# Strings use plain double quotes (typographic quotes are not valid R).
ggplot() +
  # Diabetes cases
  geom_density(
    data = diabetes_only, aes(x = blood_glucose_level),
    fill = "blue", alpha = 0.5
  ) +
  # Rows aged 2 or over with a recorded blood glucose level
  geom_density(
    data = blood_glucose_dataset, aes(x = blood_glucose_level),
    fill = "red", alpha = 0.5
  ) +
  # Heart disease cases
  geom_density(
    data = heart_disease_only, aes(x = blood_glucose_level),
    fill = "magenta", alpha = 0.5
  ) +
  labs(
    title = "Blood Glucose Density: Diabetes vs. Heart Disease",
    x = "Blood Glucose Level",
    y = "Density"
  ) +
  theme_minimal()

diabetes_dataset
## # A tibble: 100,000 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Female    54            0             0 No Info          27.3         6.6
##  3 Male      28            0             0 never            27.3         5.7
##  4 Female    36            0             0 current          23.4         5  
##  5 Male      76            1             1 current          20.1         4.8
##  6 Female    20            0             0 never            27.3         6.6
##  7 Female    44            0             0 never            19.3         6.5
##  8 Female    79            0             0 No Info          23.9         5.7
##  9 Male      42            0             0 never            33.6         4.8
## 10 Female    32            0             0 never            27.3         5  
## # ℹ 99,990 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Comparison groups, each limited to ages 2 and over.
diabetes_only <- diabetes_dataset %>%
  filter(age >= 2, diabetes == 1) %>%
  select(age, diabetes)
heart_disease_only <- diabetes_dataset %>%
  filter(age >= 2, heart_disease == 1) %>%
  select(age, heart_disease)
bmi_older_than_two <- diabetes_dataset %>%
  filter(age >= 2) %>%
  select(age, bmi)

# Overlaid age densities: blue = diabetes cases, red = heart disease cases.
ggplot() +
  geom_density(
    mapping = aes(x = age), data = diabetes_only,
    fill = "blue", alpha = 0.5
  ) +
  geom_density(
    mapping = aes(x = age), data = heart_disease_only,
    fill = "red", alpha = 0.5
  ) +
  # Disabled layer for the bmi_older_than_two subset:
  # geom_density(data = bmi_older_than_two, aes(x = age), fill = "magenta", alpha = 0.5) +
  theme_minimal() +
  labs(
    x = "Age",
    y = "Density",
    title = "Age Distribution: Diabetes vs. Heart Disease"
  )

diabetes_dataset
## # A tibble: 100,000 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Female    54            0             0 No Info          27.3         6.6
##  3 Male      28            0             0 never            27.3         5.7
##  4 Female    36            0             0 current          23.4         5  
##  5 Male      76            1             1 current          20.1         4.8
##  6 Female    20            0             0 never            27.3         6.6
##  7 Female    44            0             0 never            19.3         6.5
##  8 Female    79            0             0 No Info          23.9         5.7
##  9 Male      42            0             0 never            33.6         4.8
## 10 Female    32            0             0 never            27.3         5  
## # ℹ 99,990 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Comparison groups: diabetes and heart disease from age 2, former smokers
# from age 7.
diabetes_only <- diabetes_dataset %>%
  filter(age >= 2, diabetes == 1) %>%
  select(age, diabetes)
heart_disease_only <- diabetes_dataset %>%
  filter(age >= 2, heart_disease == 1) %>%
  select(age, heart_disease)
former_smoker_only <- diabetes_dataset %>%
  filter(age >= 7, smoking_history == "former") %>%
  select(age, smoking_history)

# Overlaid age densities: blue = diabetes cases, red = heart disease cases.
# former_smoker_only is prepared above but not drawn here.
ggplot() +
  geom_density(
    mapping = aes(x = age), data = diabetes_only,
    fill = "blue", alpha = 0.5
  ) +
  geom_density(
    mapping = aes(x = age), data = heart_disease_only,
    fill = "red", alpha = 0.5
  ) +
  theme_minimal() +
  labs(
    x = "Age",
    y = "Density",
    title = "Age Distribution: Diabetes vs. Heart Disease"
  )

# Every row whose smoking history is "former", with no age restriction.
former_smoker_only <- diabetes_dataset %>%
  filter(smoking_history == "former") %>%
  select(age, smoking_history)
# y: 7, o: 80 (presumably youngest / oldest age in this subset -- confirm)

##################!!!!!!!!!!!!!!!!!!!!

#diabetes_dataset %>% select(age, diabetes) %>% filter(diabetes == 1) %>% arrange(age)
# y: 3 o: 80
#diabetes_dataset %>% select(age,heart_disease) %>% filter(heart_disease == 1) %>% arrange(age)
# y: 2 o: 80
library(ggplot2)
library(dplyr)
#install.packages("ggthemes")
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.4.3
# Comparison groups with their own minimum ages: diabetes from 3, heart
# disease from 2, former smokers from 7.
diabetes_only <- diabetes_dataset %>%
  select(age, diabetes) %>%
  filter(age >= 3, diabetes == 1)
heart_disease_only <- diabetes_dataset %>%
  select(age, heart_disease) %>%
  filter(age >= 2, heart_disease == 1)
former_smoker_only <- diabetes_dataset %>%
  select(age, smoking_history) %>%
  filter(age >= 7, smoking_history == "former")

# Overlaid age densities: blue = diabetes, red = heart disease,
# cyan = former smokers.
ggplot() +
  theme_stata() +
  geom_density(
    data = diabetes_only, aes(x = age),
    fill = "blue", alpha = 0.5
  ) +
  geom_density(
    data = heart_disease_only, aes(x = age),
    fill = "red", alpha = 0.5
  ) +
  geom_density(
    data = former_smoker_only, aes(x = age),
    fill = "cyan", alpha = 0.5
  ) +
  # The `+` above is required: without it labs() is evaluated as a separate
  # expression and its labels object is printed instead of being applied to
  # the plot.
  labs(
    title = "Age Distribution: Diabetes vs. Heart Disease",
    x = "Age",
    y = "Density"
  )
## $x
## [1] "Age"
## 
## $y
## [1] "Density"
## 
## $title
## [1] "Age Distribution: Diabetes vs. Heart Disease"
## 
## attr(,"class")
## [1] "labels"

create horizontal lines with this info

Prediabetes & Type 2 Diabetes Risk: Often associated with BMI ≥25 (overweight or obese categories).

Normal-Weight Diabetes: Some individuals develop diabetes even with a BMI <25, due to factors like genetics, visceral fat, and metabolic health.

Higher BMI & Diabetes Progression: A BMI ≥30 significantly increases the risk of insulin resistance and complications.

```